library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
About Dataset Context The principal diagnosis method for SARS-CoV-2 is a PCR technique, which allows for the detection of a genetic material of a pathogen or microorganism and it has high specificity, sensitivity and helps diagnosing even in the first stages of infection. However, it is not the fastest method to use in this situation and it’s very time consuming.
Raman spectroscopy could be used as a cheap and quick method to diagnose infection by SARS-CoV-2.
Content Inside this dataset there’re three files, each containing the raw spectra of healthy and infected subjects, unknown-state subjects and the probe used with saline solution inside. The data was in MATLAB’s proprietary format and was transformed to CSV using Python’s Pandas library.
Source The source of this data was found in the following DOI: https://doi.org/10.6084/m9.figshare.12159924.v1
Acknowledgements If used cite: Yin, Gang; Li, Lintao; Lu, Shun; Yin, Yu; Su, Yuanzhang; Zeng, Yilan; et al. (2020): Data and code on serum Raman spectroscopy as an efficient primary screening of coronavirus disease in 2019 (COVID-19). figshare. Dataset. https://doi.org/10.6084/m9.figshare.12159924.v1
The data to process is described in:
I added a column to the data identifying the repeated experiments.
SalivaIR <- read.csv("~/GitHub/LatentBiomarkers/Data/RAMANCOVID19/covid_and_healthy_spectra.csv")
SalivaIR$X400 <- NULL
SalivaIR$RepID <- rep(c(1:3),103)
SalivaIR_set1 <- subset(SalivaIR,RepID==1)
SalivaIR_set1$RepID <- NULL
diagnos <- SalivaIR_set1$diagnostic
SalivaIR_set1$diagnostic <- NULL
SalivaIR_set2 <- subset(SalivaIR,RepID==2)
SalivaIR_set2$RepID <- NULL
SalivaIR_set2$diagnostic <- NULL
SalivaIR_set3 <- subset(SalivaIR,RepID==3)
SalivaIR_set3$RepID <- NULL
SalivaIR_set3$diagnostic <- NULL
SalivaIR_Avg <- (SalivaIR_set1 + SalivaIR_set2 + SalivaIR_set3)/3
SalivaIR_Avg$class <- 1*(diagnos=="SARS-CoV-2")
pander::pander(table(SalivaIR_Avg$class))
| 0 | 1 |
|---|---|
| 50 | 53 |
studyName <- "IRSaliva_2"
dataframe <- SalivaIR_Avg
outcome <- "class"
TopVariables <- 10
thro <- 0.80
cexheat = 0.15
Some libraries
library(psych)
library(whitening)
library("vioplot")
library("rpart")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 103 | 899 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 50 | 53 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1500
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples
dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData
numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000
if (!largeSet)
{
hm <- heatMaps(data=dataframeScaled[1:numsub,],
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9991348
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> X1453
#> X402 X405 X407 X410 X412 X415
#> 0.0741573 0.5325843 0.7382022 0.7337079 0.6910112 0.7101124
#>
#> Included: 890 , Uni p: 0.0001685393 , Base Size: 1 , Rcrit: 0.3464623
#>
#>
1 <R=0.999,thr=0.950>, Top: 78< 44 >[Fa= 78 ]( 78 , 390 , 0 ),<|><>Tot Used: 468 , Added: 390 , Zero Std: 0 , Max Cor: 0.999
#>
2 <R=0.999,thr=0.950>, Top: 63< 7 >[Fa= 141 ]( 63 , 127 , 78 ),<|><>Tot Used: 609 , Added: 127 , Zero Std: 0 , Max Cor: 0.996
#>
3 <R=0.996,thr=0.950>, Top: 23< 3 >[Fa= 164 ]( 23 , 28 , 141 ),<|><>Tot Used: 654 , Added: 28 , Zero Std: 0 , Max Cor: 0.982
#>
4 <R=0.982,thr=0.950>, Top: 4< 2 >[Fa= 168 ]( 4 , 5 , 164 ),<|><>Tot Used: 663 , Added: 5 , Zero Std: 0 , Max Cor: 0.980
#>
5 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 169 ]( 1 , 1 , 168 ),<|><>Tot Used: 665 , Added: 1 , Zero Std: 0 , Max Cor: 0.950
#>
6 <R=0.950,thr=0.900>, Top: 99< 10 >[Fa= 213 ]( 93 , 191 , 169 ),<|><>Tot Used: 754 , Added: 191 , Zero Std: 0 , Max Cor: 0.990
#>
7 <R=0.990,thr=0.950>, Top: 4< 1 >[Fa= 214 ]( 4 , 4 , 213 ),<|><>Tot Used: 754 , Added: 4 , Zero Std: 0 , Max Cor: 0.949
#>
8 <R=0.949,thr=0.900>, Top: 32< 1 >[Fa= 233 ]( 32 , 43 , 214 ),<|><>Tot Used: 787 , Added: 43 , Zero Std: 0 , Max Cor: 0.955
#>
9 <R=0.955,thr=0.950>, Top: 1< 1 >[Fa= 233 ]( 1 , 1 , 233 ),<|><>Tot Used: 788 , Added: 1 , Zero Std: 0 , Max Cor: 0.949
#>
10 <R=0.949,thr=0.900>, Top: 7< 1 >[Fa= 237 ]( 7 , 7 , 233 ),<|><>Tot Used: 797 , Added: 7 , Zero Std: 0 , Max Cor: 0.900
#>
11 <R=0.900,thr=0.800>, Top: 83< 2 >[Fa= 267 ]( 79 , 167 , 237 ),<|><>Tot Used: 833 , Added: 167 , Zero Std: 0 , Max Cor: 0.924
#>
12 <R=0.924,thr=0.900>, Top: 2< 1 >[Fa= 267 ]( 2 , 2 , 267 ),<|><>Tot Used: 834 , Added: 2 , Zero Std: 0 , Max Cor: 0.899
#>
13 <R=0.899,thr=0.800>, Top: 33< 1 >[Fa= 282 ]( 31 , 56 , 267 ),<|><>Tot Used: 860 , Added: 56 , Zero Std: 0 , Max Cor: 0.892
#>
14 <R=0.892,thr=0.800>, Top: 5< 1 >[Fa= 285 ]( 5 , 5 , 282 ),<|><>Tot Used: 866 , Added: 5 , Zero Std: 0 , Max Cor: 0.820
#>
15 <R=0.820,thr=0.800>, Top: 1< 1 >[Fa= 286 ]( 1 , 1 , 285 ),<|><>Tot Used: 866 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#>
16 <R=0.800,thr=0.800>
#>
[ 16 ], 0.799788 Decor Dimension: 866 Nused: 866 . Cor to Base: 328 , ABase: 890 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
0.0708
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
0.00393
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.18
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
2.97
varratio <- attr(DEdataframe,"VarRatio")
pander::pander(tail(varratio))
| La_X1468 | La_X1455 | La_X1450 | La_X1463 | La_X1451 | La_X1461 |
|---|---|---|---|---|---|
| 0.000696 | 0.000671 | 0.000482 | 0.000462 | 0.000434 | 0.000428 |
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPLTM <- attr(DEdataframe,"UPLTM")
gplots::heatmap.2(1.0*(abs(UPLTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
Displaying the features associations
par(op)
clustable <- c("To many variables")
transform <- attr(DEdataframe,"UPLTM") != 0
tnames <- colnames(transform)
colnames(transform) <- str_remove_all(colnames(transform),"La_")
transform <- abs(transform*cor(dataframe[,rownames(transform)])) # The weights are proportional to the observed correlation
fscore <- attr(DEdataframe,"fscore")
VertexSize <- fscore # The size depends on the variable independence relevance (fscore)
names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization
VertexSize <- VertexSize[rownames(transform)]
rsum <- apply(1*(transform !=0),1,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
csum <- apply(1*(transform !=0),2,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
ntop <- min(10,length(rsum))
topfeatures <- unique(c(names(rsum[order(-rsum)])[1:ntop],names(csum[order(-csum)])[1:ntop]))
rtrans <- transform[topfeatures,]
csum <- (apply(1*(rtrans !=0),2,sum) > 1*(colnames(rtrans) %in% topfeatures))
rtrans <- rtrans[,csum]
topfeatures <- unique(c(topfeatures,colnames(rtrans)))
print(ncol(transform))
[1] 866
transform <- transform[topfeatures,topfeatures]
print(ncol(transform))
[1] 371
if (ncol(transform)>100)
{
csum <- apply(1*(transform !=0),1,sum)
csum <- csum[csum > 1]
csum <- csum + 0.01*VertexSize[names(csum)]
csum <- csum[order(-csum)]
tpsum <- min(20,length(csum))
trsum <- rownames(transform)[rownames(transform) %in% names(csum[1:tpsum])]
rtrans <- transform[trsum,]
topfeatures <- unique(c(rownames(rtrans),colnames(rtrans)))
transform <- transform[topfeatures,topfeatures]
if (nrow(transform) > 150)
{
csum <- apply(1*(rtrans != 0 ),2,sum)
csum <- csum + 0.01*VertexSize[names(csum)]
csum <- csum[order(-csum)]
tpsum <- min(130,length(csum))
csum <- rownames(transform)[rownames(transform) %in% names(csum[1:tpsum])]
csum <- unique(c(trsum,csum))
transform <- transform[csum,csum]
}
print(ncol(transform))
}
[1] 130
if (ncol(transform) < 150)
{
gplots::heatmap.2(transform,
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Red Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
VertexSize <- VertexSize[colnames(transform)]
gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
gr$layout <- layout_with_fr
fc <- cluster_optimal(gr)
plot(fc, gr,
edge.width = 2*E(gr)$weight,
vertex.size=VertexSize,
edge.arrow.size=0.5,
edge.arrow.width=0.5,
vertex.label.cex=(0.15+0.05*VertexSize),
vertex.label.dist=0.5 + 0.05*VertexSize,
main="Top Feature Association")
varratios <- varratio
fscores <- fscore
names(varratios) <- str_remove_all(names(varratios),"La_")
names(fscores) <- str_remove_all(names(fscores),"La_")
dc <- getLatentCoefficients(DEdataframe)
theCharformulas <- attr(dc,"LatentCharFormulas")
clustable <- as.data.frame(cbind(Variable=fc$names,
Formula=as.character(theCharformulas[paste("La_",fc$names,sep="")]),
Class=fc$membership,
ResidualVariance=round(varratios[fc$names],3),
Fscore=round(fscores[fc$names],3)
)
)
rownames(clustable) <- str_replace_all(rownames(clustable),"__","_")
clustable$Variable <- NULL
clustable$Class <- as.integer(clustable$Class)
clustable$ResidualVariance <- as.numeric(clustable$ResidualVariance)
clustable$Fscore <- as.numeric(clustable$Fscore)
clustable <- clustable[order(-clustable$Fscore),]
clustable <- clustable[order(clustable$Class),]
clustable <- clustable[clustable$Fscore >= -1,]
topv <- min(50,nrow(clustable))
clustable <- clustable[1:topv,]
}
pander::pander(clustable)
| Formula | Class | ResidualVariance | Fscore | |
|---|---|---|---|---|
| X601 | - (1.083)X466 + X601 | 1 | 0.161 | 97 |
| X678 | - (0.881)X601 + X678 | 1 | 0.026 | 10 |
| X724 | - (0.766)X601 + X724 | 1 | 0.048 | 8 |
| X629 | - (1.019)X601 + X629 | 1 | 0.034 | 7 |
| X558 | + X558 - (0.921)X601 | 1 | 0.041 | 4 |
| X548 | + X548 - (0.877)X601 | 1 | 0.043 | 3 |
| X648 | - (0.960)X601 + X648 | 1 | 0.022 | 3 |
| X641 | - (0.929)X601 + X641 | 1 | 0.036 | 2 |
| X703 | - (0.820)X601 + X703 | 1 | 0.036 | 2 |
| X749 | - (0.018)X601 - (0.854)X724 + X749 | 1 | 0.018 | 1 |
| X767 | + (0.044)X601 - (0.880)X724 + X767 | 1 | 0.024 | 1 |
| X593 | + X593 - (0.258)X601 - (0.723)X629 | 1 | 0.005 | 0 |
| X617 | - (0.239)X601 + X617 - (0.736)X629 | 1 | 0.010 | 0 |
| X662 | - (0.086)X601 + X662 - (0.924)X678 | 1 | 0.010 | 0 |
| X694 | - (0.030)X601 - (0.903)X678 + X694 | 1 | 0.010 | 0 |
| X715 | - (0.053)X601 - (0.827)X678 + X715 | 1 | 0.011 | 0 |
| X733 | - (0.054)X601 - (0.859)X724 + X733 | 1 | 0.013 | 0 |
| X657 | - (0.253)X601 + X657 - (0.761)X678 | 1 | 0.004 | -1 |
| X669 | - (0.229)X601 + X669 - (0.755)X678 | 1 | 0.006 | -1 |
| X676 | - (0.065)X601 + X676 - (0.926)X678 | 1 | 0.004 | -1 |
| X685 | + (0.170)X601 - (1.163)X678 + X685 | 1 | 0.016 | -1 |
| X710 | - (0.216)X601 + X710 - (0.778)X724 | 1 | 0.008 | -1 |
| X1453 | NA | 2 | 1.000 | 69 |
| X1187 | + X1187 + (0.177)X1453 | 2 | 0.191 | 17 |
| X1448 | + X1448 - (0.702)X1453 | 2 | 0.026 | 13 |
| X1464 | - (0.890)X1453 + X1464 | 2 | 0.083 | 9 |
| X1466 | + (0.062)X1453 - (0.840)X1464 + X1466 | 2 | 0.002 | 5 |
| X958 | + X958 - (0.093)X1453 | 2 | 0.166 | 3 |
| X1243 | + X1243 + (0.057)X1453 | 2 | 0.336 | 3 |
| X943 | + X943 - (0.143)X1453 | 2 | 0.184 | 2 |
| X1033 | + X1033 + (0.641)X1448 - (0.666)X1453 | 2 | 0.035 | 2 |
| X1113 | + X1113 - (0.504)X1187 | 2 | 0.103 | 2 |
| X1231 | + X1231 + (0.074)X1453 | 2 | 0.340 | 2 |
| X1310 | + X1310 - (0.844)X1448 + (0.593)X1453 | 2 | 0.278 | 2 |
| X1386 | + X1386 + (0.215)X1453 | 2 | 0.033 | 2 |
| X1393 | + X1393 + (0.160)X1453 | 2 | 0.047 | 2 |
| X1438 | + X1438 - (0.584)X1453 | 2 | 0.038 | 2 |
| X1273 | + X1273 - (0.453)X1448 + (0.438)X1453 | 2 | 0.070 | 1 |
| X1431 | + X1431 - (0.584)X1453 + (0.345)X1464 | 2 | 0.036 | 1 |
| X1303 | + X1303 - (0.644)X1448 + (0.452)X1453 | 2 | 0.319 | 0 |
| X1314 | + X1314 - (0.887)X1448 + (0.623)X1453 | 2 | 0.168 | -1 |
| X1357 | + X1357 - (0.173)X1453 + (0.194)X1464 | 2 | 0.171 | -1 |
| X1363 | + X1363 + (0.277)X1453 - (0.191)X1464 | 2 | 0.062 | -1 |
| X1446 | + X1446 - (1.052)X1448 + (0.107)X1453 | 2 | 0.001 | -1 |
| X1455 | + (0.382)X1448 - (1.378)X1453 + X1455 | 2 | 0.001 | -1 |
| X466 | NA | 3 | 1.000 | 36 |
| X510 | - (0.933)X466 + X510 | 3 | 0.119 | 11 |
| X434 | + X434 - (1.128)X466 | 3 | 0.064 | 8 |
| X420 | + X420 - (1.043)X434 + (0.173)X466 | 3 | 0.017 | 4 |
| X454 | + X454 - (0.845)X466 | 3 | 0.126 | 3 |
par(op)
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after ILAA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.799788
classes <- unique(dataframe[1:numsub,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
topvars <- univariate_BinEnsemble(dataframe,outcome)
lso <- LASSO_MIN(formula(paste(outcome,"~.")),dataframe,family="binomial")
topvars <- unique(c(names(topvars),lso$selectedfeatures))
pander::pander(head(topvars))
X1255, X1757, X1761, X1762, X1780 and X1782
# names(topvars)
#if (nrow(dataframe) < 1000)
#{
datasetframe.umap = umap(scale(dataframe[1:numsub,topvars]),n_components=2)
# datasetframe.umap = umap(dataframe[1:numsub,varlist],n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
#}
varlistcV <- names(varratio[varratio >= 0.01])
topvars <- univariate_BinEnsemble(DEdataframe[,varlistcV],outcome)
lso <- LASSO_MIN(formula(paste(outcome,"~.")),DEdataframe[,varlistcV],family="binomial")
topvars <- unique(c(names(topvars),lso$selectedfeatures))
pander::pander(head(topvars))
X1255, X2082, X1942, X2056, La_X1201 and La_X1353
varlistcV <- varlistcV[varlistcV != outcome]
# DEdataframe[,outcome] <- as.numeric(DEdataframe[,outcome])
#if (nrow(dataframe) < 1000)
#{
datasetframe.umap = umap(scale(DEdataframe[1:numsub,topvars]),n_components=2)
# datasetframe.umap = umap(DEdataframe[1:numsub,varlistcV],n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
#}
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : X641 200 : X869 300 : X1088 400 : X1293 500 : X1486
600 : X1667 700 : X1833 800 : X1985
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : La_X641 200 : La_X869 300 : X1088 400 : La_X1293 500 :
X1486
600 : La_X1667 700 : La_X1833 800 : La_X1985
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| X2078 | -0.00831 | 0.000859 | -0.00645 | 0.000455 | 0.990 | 0.944 |
| X2045 | -0.01737 | 0.000858 | -0.01574 | 0.000713 | 0.388 | 0.937 |
| X2080 | -0.00870 | 0.000957 | -0.00671 | 0.000464 | 0.985 | 0.936 |
| X2086 | -0.01137 | 0.001083 | -0.00908 | 0.000517 | 0.783 | 0.935 |
| X2046 | -0.01814 | 0.000941 | -0.01636 | 0.000604 | 0.882 | 0.935 |
| X2077 | -0.00826 | 0.000843 | -0.00674 | 0.000458 | 0.959 | 0.934 |
| X2085 | -0.01060 | 0.001052 | -0.00825 | 0.000465 | 0.799 | 0.934 |
| X2088 | -0.01167 | 0.000953 | -0.00970 | 0.000500 | 0.898 | 0.932 |
| X2048 | -0.01880 | 0.000968 | -0.01698 | 0.000660 | 0.672 | 0.928 |
| X2049 | -0.01907 | 0.001017 | -0.01718 | 0.000782 | 0.868 | 0.926 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
pander::pander(finalTable)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| La_X1575 | -0.008555 | 0.001137 | -0.01036 | 0.000625 | 0.973 | 0.922 |
| X2082 | -0.009254 | 0.001368 | -0.00729 | 0.000440 | 0.909 | 0.918 |
| La_X415 | 0.000796 | 0.001516 | -0.00160 | 0.000915 | 0.338 | 0.907 |
| La_X969 | 0.001040 | 0.002240 | 0.00537 | 0.002658 | 0.616 | 0.907 |
| La_X598 | -0.001178 | 0.000705 | -0.00241 | 0.000643 | 0.219 | 0.904 |
| La_X407 | -0.008368 | 0.002743 | -0.01202 | 0.001372 | 0.395 | 0.902 |
| La_X1279 | 0.011742 | 0.000975 | 0.01012 | 0.000804 | 1.000 | 0.902 |
| La_X1027 | 0.007151 | 0.000999 | 0.00552 | 0.000784 | 0.707 | 0.898 |
| La_X1201 | -0.006897 | 0.000999 | -0.00870 | 0.001005 | 0.769 | 0.898 |
| X2056 | -0.021217 | 0.001332 | -0.01900 | 0.001003 | 0.709 | 0.896 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.29 | 816 | 0.917 |
theCharformulas <- attr(dc,"LatentCharFormulas")
topvar <- rownames(tableRaw)
finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
finalTable$varratio <- varratio[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores","varratio")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | varratio | |
|---|---|---|---|---|---|---|---|---|---|---|
| X2078 | NA | -0.008314 | 0.000859 | -0.00645 | 0.000455 | 0.990 | 0.944 | 0.944 | NA | NA |
| X2045 | NA | -0.017370 | 0.000858 | -0.01574 | 0.000713 | 0.388 | 0.937 | 0.937 | NA | NA |
| X2080 | NA | -0.008703 | 0.000957 | -0.00671 | 0.000464 | 0.985 | 0.936 | 0.936 | NA | NA |
| X2086 | NA | -0.011371 | 0.001083 | -0.00908 | 0.000517 | 0.783 | 0.935 | 0.935 | NA | NA |
| X2046 | NA | -0.018144 | 0.000941 | -0.01636 | 0.000604 | 0.882 | 0.935 | 0.935 | NA | NA |
| X2077 | NA | -0.008257 | 0.000843 | -0.00674 | 0.000458 | 0.959 | 0.934 | 0.934 | NA | NA |
| X2085 | NA | -0.010600 | 0.001052 | -0.00825 | 0.000465 | 0.799 | 0.934 | 0.934 | NA | NA |
| X2088 | NA | -0.011674 | 0.000953 | -0.00970 | 0.000500 | 0.898 | 0.932 | 0.932 | NA | NA |
| X2048 | NA | -0.018797 | 0.000968 | -0.01698 | 0.000660 | 0.672 | 0.928 | 0.928 | NA | NA |
| X2049 | NA | -0.019069 | 0.001017 | -0.01718 | 0.000782 | 0.868 | 0.926 | 0.926 | NA | NA |
| La_X1575 | + X1575 - (0.979)X1584 | -0.008555 | 0.001137 | -0.01036 | 0.000625 | 0.973 | 0.922 | 0.510 | -1 | 0.07454 |
| X2082 | NA | -0.009254 | 0.001368 | -0.00729 | 0.000440 | 0.909 | 0.918 | 0.918 | 0 | 1.00000 |
| La_X415 | + X415 - (1.502)X417 + (0.492)X420 | 0.000796 | 0.001516 | -0.00160 | 0.000915 | 0.338 | 0.907 | 0.731 | -2 | 0.01915 |
| La_X969 | - (1.406)X967 + X969 | 0.001040 | 0.002240 | 0.00537 | 0.002658 | 0.616 | 0.907 | 0.595 | -1 | 0.03456 |
| La_X598 | + X598 - (0.997)X601 | -0.001178 | 0.000705 | -0.00241 | 0.000643 | 0.219 | 0.904 | 0.574 | -1 | 0.00388 |
| La_X407 | + X407 - (0.805)X420 | -0.008368 | 0.002743 | -0.01202 | 0.001372 | 0.395 | 0.902 | 0.763 | 1 | 0.06709 |
| La_X1279 | + X1279 - (1.078)X1283 | 0.011742 | 0.000975 | 0.01012 | 0.000804 | 1.000 | 0.902 | 0.561 | 0 | 0.04362 |
| La_X1027 | + X1027 - (0.716)X1029 | 0.007151 | 0.000999 | 0.00552 | 0.000784 | 0.707 | 0.898 | 0.674 | -1 | 0.06223 |
| La_X1201 | - (0.536)X1187 + X1201 | -0.006897 | 0.000999 | -0.00870 | 0.001005 | 0.769 | 0.898 | 0.710 | 1 | 0.10967 |
| X2056 | NA | -0.021217 | 0.001332 | -0.01900 | 0.001003 | 0.709 | 0.896 | 0.896 | 7 | 1.00000 |
featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE,tol=0.01) #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous])
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(pc$rotation)
PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])
gplots::heatmap.2(abs(PCACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "PCA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
EFAdataframe <- dataframeScaled
if (length(iscontinous) < 2000)
{
topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)-1)
if (topred < 2) topred <- 2
uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE) # EFA analysis
predEFA <- predict(uls,dataframeScaled[,iscontinous])
EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous])
EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
gplots::heatmap.2(abs(EFACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "EFA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
}
par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(rawmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
}
pander::pander(table(dataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 50 | 0 |
| 1 | 6 | 47 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.942 | 0.878 | 0.978 |
| 3 | se | 0.887 | 0.770 | 0.957 |
| 4 | sp | 1.000 | 0.929 | 1.000 |
| 6 | diag.or | Inf | NA | Inf |
par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe[,c(outcome,varlistcV)],control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(IDeAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
}
pander::pander(table(DEdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 50 | 0 |
| 1 | 6 | 47 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.942 | 0.878 | 0.978 |
| 3 | se | 0.887 | 0.770 | 0.957 |
| 4 | sp | 1.000 | 0.929 | 1.000 |
| 6 | diag.or | Inf | NA | Inf |
par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(PCAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}
pander::pander(table(PCAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 44 | 6 |
| 1 | 4 | 49 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.903 | 0.829 | 0.952 |
| 3 | se | 0.925 | 0.818 | 0.979 |
| 4 | sp | 0.880 | 0.757 | 0.955 |
| 6 | diag.or | 89.833 | 23.782 | 339.333 |
par(op)
EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(EFAmodel,EFAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(EFAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
}
pander::pander(table(EFAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 47 | 3 |
| 1 | 11 | 42 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.864 | 0.782 | 0.924 |
| 3 | se | 0.792 | 0.659 | 0.892 |
| 4 | sp | 0.940 | 0.835 | 0.987 |
| 6 | diag.or | 59.818 | 15.621 | 229.071 |
par(op)